# Reading in "control genotype" data
megamuga_controls <- data.table::fread("../data/MMControls/MegaMUGA genotypes CONTROLS.csv")
control_genotypes <- data.table::fread("../data/MMControls/control.genotypes.csv")
## Warning in data.table::fread("../data/MMControls/control.genotypes.csv"):
## Detected 364 column names but the data has 365 columns (i.e. invalid file).
## Added 1 extra default column name for the first column which is guessed to be
## row names or an index. Use setnames() afterwards if this guess is not correct,
## or fix the file write command that created the file to create a valid file.
colnames(control_genotypes)[1] <- "marker"
## Reading in marker annotations
mm_metadata <- data.table::fread("../data/MMControls/mm_uwisc_v2.csv")
## Calculating allele frequencies for each marker
control_allele_freqs <- control_genotypes %>%
tidyr::pivot_longer(-marker, names_to = "sample", values_to = "genotype") %>%
dplyr::group_by(marker, genotype) %>%
dplyr::summarise(n = n()) %>%
dplyr::mutate(freq = round(n/sum(n), 3),
genotype = as.factor(genotype)) %>%
dplyr::left_join(., mm_metadata)
First, we searched for probes where many mice are missing genotype calls.
## Filtering to markers with missing genotypes
no.calls <- control_allele_freqs %>%
dplyr::filter(genotype == "N") %>%
tidyr::pivot_wider(names_from = genotype, values_from = n) %>%
dplyr::select(marker, chr, bp_grcm39, freq) %>%
dplyr::mutate(chr = as.factor(chr))
cutoff <- quantile(no.calls$freq, probs = seq(0,1,0.05))[[20]]
above.cutoff <- no.calls %>%
dplyr::filter(freq > cutoff)
Of 77808 markers, 54990 failed to genotype at least one sample, and 2750 markers failed to genotype at least 12.765% of samples.
In a similar fashion, we calculated the number of reference samples with missing genotypes. Repeated observations of samples/strains meant that genotype counts for each marker among them couldn’t be grouped and tallied, so determining no-call frequency occurred column-wise.
n.calls.strains <- apply(X = control_genotypes[,2:ncol(control_genotypes)], MARGIN = 2, function(x) table(x)[5])
n.calls.strains.df <- data.frame(n.calls.strains)
n.calls.strains.df$sample <- names(n.calls.strains)
n.calls.strains.df %<>%
dplyr::rename(n.no.calls = n.calls.strains)
sampleQC <- ggplot(n.calls.strains.df,
mapping = aes(x = reorder(sample,n.no.calls),
y = n.no.calls,
text = paste("Sample:", sample))) +
geom_point() +
QCtheme +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
labs(x = "Number of mice with missing genotypes",
y = "Number of markers")
ggplotly(sampleQC, tooltip = c("text","y"))